# GSS_create_imputed.R 

# Part of the replication archive for 
#
#   Bullock, John G. 2020. "Education and Attitudes toward Redistribution in
#   the United States." British Journal of Political Science 50.


# This file should be called only after a dataset has been created by
# GSS_makeDatasetForImputation.R.  



# Coding rules:
# --Cases should be excluded from the missing-data imputation if they  
#   (a) are missing data on the dependent variable in my IV analyses, or (b) 
#   are for people who were never assigned to be asked the dependent-variable
#   question.  I code these cases as 0.  makePredictorMatrix() then omits them 
#   from the analysis.  (Note that makePredictorMatrix() omits cases that have 
#   "0" values *in the dependent variable*.  It does not pay attention to "0" 
#   values in other variables.)  


library(Amelia)
library(Bullock, lib.loc = c(.libPaths(), 'packageLibrary'))  # for Bullock::rescale() 
library(foreach)  # to use multiple cores
if (Sys.info()['sysname'] == 'Windows') { 
  library(doParallel)  # to use multiple cores
} 

# Bring in the helper script and the dataset that the rest of this file uses.
# NOTE(review): reqd.vars and cand.vars are used below but never defined in
# this file; presumably one of these two files supplies them -- confirm.
source(file = 'functions/makePredictorMatrix.R')
load(file = 'data/GSS_datasetForImputation.RData')  # created by GSS_makeDatasetForImputation.R



####################################################################
# CHOOSE CANDIDATE VARIABLES AND RUN AMELIA
####################################################################
# Build one data frame per outcome.  Each call passes the outcome name, the
# required variables, the candidate variables, and the name of the ID column.
eqwlth.imput.df <- makePredictorMatrix(
  'eqwlth',
  reqd.vars = reqd.vars,
  cand.vars,
  ID        = 'respondentID')

goveqinc.imput.df <- makePredictorMatrix(
  'goveqinc',
  reqd.vars = reqd.vars,
  cand.vars,
  ID        = 'respondentID')

welfare.imput.df <- makePredictorMatrix(
  'welfare',
  reqd.vars = reqd.vars,
  cand.vars,
  ID        = 'respondentID')

helppoor.imput.df <- makePredictorMatrix(
  'helppoor',
  reqd.vars = reqd.vars,
  cand.vars,
  ID        = 'respondentID')

# RESCALE OUTCOMES FROM 0 TO 1  [2015 03 15]
# Order matters.  The rescaling must come before the imputation, but only
# after makePredictorMatrix() has removed the cases coded "0" on the outcome
# (see the coding rules at the top of this file).  Each outcome column is
# replaced in place by its rescaled version.
eqwlth.imput.df[['eqwlth']]     <- Bullock::rescale(eqwlth.imput.df[['eqwlth']])
goveqinc.imput.df[['goveqinc']] <- Bullock::rescale(goveqinc.imput.df[['goveqinc']])
welfare.imput.df[['welfare']]   <- Bullock::rescale(welfare.imput.df[['welfare']])
helppoor.imput.df[['helppoor']] <- Bullock::rescale(helppoor.imput.df[['helppoor']])

# CREATE MULTIPLE IMPUTATION FUNCTION
# Imputation is slow, and by default, it runs on only one core. The extra 
# code here makes use of multiple cores. For more information, see
# https://lists.gking.harvard.edu/pipermail/amelia/2012-July/000896.html.  
# imputeMultiCore(): create m imputed datasets by running m separate
# one-dataset amelia() calls through foreach and binding the results together.
#
# Arguments:
#   x    Data to impute; passed unchanged as the first argument of amelia().
#   m    Number of imputed datasets to create.  Must be a single number >= 1.
#   ...  Further arguments, forwarded as-is to every amelia() call (in this
#        file: ts, idvars, noms, polytime, empri).
#
# Value: the one-imputation results, combined with ameliabind().
#
# Notes:
# --Every task calls amelia() with m = 1, so the tasks can be handed out to
#   whatever backend is registered for %dopar%.
# --.inorder = FALSE means that results need not be combined in task order.
# --NOTE(review): when the tasks run on cluster workers, a seed set with
#   set.seed() in the calling session presumably does not control the
#   workers' random-number streams -- confirm before relying on exact
#   reproducibility of parallel runs.
imputeMultiCore <- function (x, m, ...) {
  # Reject m < 1 (and NA or non-scalar m).  The previous 1:m counted down to
  # c(1, 0) when m was 0, which silently ran two imputations instead of none.
  stopifnot(is.numeric(m), length(m) == 1L, !is.na(m), m >= 1)

  # as.integer() truncates a fractional m, just as 1:m did.
  n.tasks <- as.integer(m)

  foreach (
    i             = seq_len(n.tasks),
    .combine      = "ameliabind",
    .multicombine = TRUE,
    .packages     = "Amelia",
    .inorder      = FALSE
  ) %dopar% {
    amelia(x, m = 1, ...)
  } 
}

# SEED THE RANDOM-NUMBER GENERATOR
# Fixes the RNG state of this R session before any imputation is run.
# NOTE(review): on Windows the imputations run on the cluster workers started
# below; presumably this seed does not carry over to those worker processes,
# in which case the Windows results are not exactly reproducible -- confirm.
set.seed(seed = 1977)

# START THE PARALLEL BACKEND (WINDOWS ONLY)
# On Windows, create a four-worker cluster and register it with foreach so
# that %dopar% has a backend.  No backend is registered on other systems.
if (Sys.info()[['sysname']] == 'Windows') {
  cl <- makeCluster(spec = 4)
  registerDoParallel(cl = cl)
}

# IMPUTE EACH GSS OUTCOME
# All four calls create 10 imputed datasets and share the same settings:
# 'yearInt' is the time variable (ts), 'respondentID' is an ID variable
# (idvars), time enters as a first-order polynomial (polytime = 1), and no
# empirical prior is used (empri = 0).  Only the set of variables declared
# nominal (noms) differs from one outcome to the next.
eqwlth.out <- imputeMultiCore(
  x        = eqwlth.imput.df,
  m        = 10,
  ts       = 'yearInt',
  idvars   = 'respondentID',
  noms     = c('race', 'wrkstat', 'wrkslf'),
  polytime = 1,
  empri    = 0
)

goveqinc.out <- imputeMultiCore(
  x        = goveqinc.imput.df,
  m        = 10,
  ts       = 'yearInt',
  idvars   = 'respondentID',
  noms     = c('race', 'wrkstat', 'wrkslf', 'relig'),
  polytime = 1,
  empri    = 0
)

welfare.out <- imputeMultiCore(
  x        = welfare.imput.df,
  m        = 10,
  ts       = 'yearInt',
  idvars   = 'respondentID',
  noms     = c('race', 'wrkstat', 'relig'),
  polytime = 1,
  empri    = 0
)

# An ordinal declaration for income was tried here and left disabled:
#   ords = c('income', 'rincome')
helppoor.out <- imputeMultiCore(
  x        = helppoor.imput.df,
  m        = 10,
  ts       = 'yearInt',
  idvars   = 'respondentID',
  noms     = c('race', 'wrkstat'),
  polytime = 1,
  empri    = 0
)


# WRITE EACH IMPUTATION OBJECT TO ITS OWN .RData FILE
# Each file holds a single object, stored under the name it has in this script.
save(eqwlth.out,   file = 'data/GSSImputedDatasetEqwlth.RData')
save(goveqinc.out, file = 'data/GSSImputedDatasetGoveqinc.RData')
save(welfare.out,  file = 'data/GSSImputedDatasetWelfare.RData')
save(helppoor.out, file = 'data/GSSImputedDatasetHelppoor.RData')



####################################################################
# STOP THE CLUSTER AND QUIT
####################################################################
# A cluster exists only on Windows (it is created before the imputations are
# run), so that is the only platform on which there is anything to shut down.
if (Sys.info()[['sysname']] == 'Windows') {
  stopCluster(cl = cl)
}
